Load necessary packages

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
library(scales)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(RColorBrewer)
library(treemap)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Choose a color palette

colors <- brewer.pal(9, "YlGnBu")

Load Dataset

budget_data <- read.csv('bcl14.csv')

check dimensions of data

dim(budget_data)
## [1] 270   6

Check for missing values in the entire dataset

sum(is.na(budget_data))
## [1] 0

Check for missing values in all columns

sapply(budget_data, function(x) sum(is.na(x)))
##                        Fund                  Department 
##                           0                           0 
##                    BCL.Code                    BCL.Name 
##                           0                           0 
##                 BCL.Purpose X2014.Expenditure.Allowance 
##                           0                           0

duplicate check

sum(duplicated(budget_data))
## [1] 0

View structure of data

str(budget_data)
## 'data.frame':    270 obs. of  6 variables:
##  $ Fund                       : chr  "General Subfund" "General Subfund" "General Subfund" "General Subfund" ...
##  $ Department                 : chr  "Civil Service Commissions" "Department of Neighborhoods" "Department of Neighborhoods" "Department of Neighborhoods" ...
##  $ BCL.Code                   : chr  "V1CIV" "I3100" "I3200" "I3300" ...
##  $ BCL.Name                   : chr  "Civil Service Commissions" "Director's Office" "Internal Operations" "Community Building" ...
##  $ BCL.Purpose                : chr  "The purpose of the Civil Service Commissions Budget Control Level is to provide administrative support to the P"| __truncated__ "The purpose of the Director's Office Budget Control Level is to provide executive leadership, communications, a"| __truncated__ "The purpose of the Internal Operations Budget Control Level is to provide financial, human resources, facility,"| __truncated__ "The purpose of the Community Building Budget Control Level is to deliver technical assistance, support services"| __truncated__ ...
##  $ X2014.Expenditure.Allowance: num  385887 485705 1426675 3458080 5631045 ...

View summary of data

summary(budget_data)
##      Fund            Department          BCL.Code           BCL.Name        
##  Length:270         Length:270         Length:270         Length:270        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  BCL.Purpose        X2014.Expenditure.Allowance
##  Length:270         Min.   :     1999          
##  Class :character   1st Qu.:  1309259          
##  Mode  :character   Median :  4850475          
##                     Mean   : 17326490          
##                     3rd Qu.: 13468717          
##                     Max.   :305641502

Correct data types

budget_data$Fund <- as.factor(budget_data$Fund)
budget_data$Department <- as.factor(budget_data$Department)
budget_data$BCL.Code <- as.factor(budget_data$BCL.Code)
budget_data$BCL.Name <- as.factor(budget_data$BCL.Name)
budget_data$BCL.Purpose <- as.factor(budget_data$BCL.Purpose)

Outlier check

boxplot(budget_data$Fund, main = "Fund", ylab = "Expenditure Allowance", col = "steelblue")

boxplot(budget_data$Department, main = "Department", ylab = "Expenditure Allowance", col = "darkgreen")

boxplot(budget_data$BCL.Code, main = "BCL Code", ylab = "Expenditure Allowance", col = "purple")

boxplot(budget_data$BCL.Name, main = "BCL Name", ylab = "Expenditure Allowance", col = "red")

boxplot(budget_data$BCL.Purpose, main = "BCL Purpose", ylab = "Expenditure Allowance", col = "orange")

boxplot(
  budget_data$`X2014.Expenditure.Allowance`,
  main = "2014 Expenditure Allowance",
  ylab = "Expenditure Allowance",
  col = "blue",
  boxwex = 0.5, # Narrower box
  outpch = 20, # Solid circle for outlier points
  outcol = "red" # Red color for outlier points
)

Create new variable for total expenditure

budget_data$total_expenditure <- budget_data$`X2014.Expenditure.Allowance`

Summarize total expenditure by department

department_summary <- budget_data %>%
  group_by(Department) %>%
  summarise(total_expenditure = sum(total_expenditure)) %>%
  arrange(desc(total_expenditure))

Create a vector of custom colors

my_colors <- c("steelblue", "orange", "green", "purple", "red", "blue", "yellow", "brown", "grey", "pink")

Create a bar plot of top 10 departments by total expenditure

ggplot(head(department_summary, 10), aes(x = Department, y = total_expenditure, fill = Department)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = my_colors) +  # Use custom colors
  xlab("Department") +
  ylab("Total Expenditure") +
  ggtitle("Top 10 Departments by Total Expenditure in 2014") 

Summarize total expenditure by fund

fund_summary <- budget_data %>%
  group_by(Fund) %>%
  summarise(total_expenditure = sum(total_expenditure)) %>%
  arrange(desc(total_expenditure))

Calculate the total expenditure allowance for each BCL code

expenditure_by_bcl <- budget_data %>%
  group_by(BCL.Code) %>%
  summarize(total_expenditure = sum(`X2014.Expenditure.Allowance`))

Create a bar plot without key

ggplot(expenditure_by_bcl, aes(x = BCL.Code, y = total_expenditure, fill = BCL.Code)) +
  geom_bar(stat = "identity") +
  xlab("BCL Code") +
  ylab("Total Expenditure Allowance") +
  ggtitle("Distribution of Expenditure Allowance by BCL Code") +
  scale_fill_hue(l = 40, c = 80) + # adds a color scale
  guides(fill = FALSE) # removes the legend/key
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

count unique elements

length(unique(budget_data$BCL.Code))
## [1] 250

Create the scatterplot

ggplot(budget_data, aes(x = `X2014.Expenditure.Allowance`, y = Department, color = Department)) +
  geom_point() +
  scale_color_hue(l = 40, c = 80) + # adds a color scale
  xlab("Expenditure Allowance") +
  ylab("Department Count") +
  ggtitle("Relationship Between Expenditure Allowance and Department Count")

Create a density plot without key

ggplot(budget_data, aes(x = `X2014.Expenditure.Allowance`, fill = Department)) +
  geom_density(alpha = 0.5) +
  xlab("Expenditure Allowance (in R)") +
  ylab("Density") +
  ggtitle("Expenditure Allowance by Department") +
  scale_fill_hue(l = 40, c = 80) + # adds a color scale
  theme_classic() +
  guides(fill = FALSE) # removes the legend/key
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

Create a summary table with counts of departments by fund and expenditure allowance

dept_summary <- budget_data %>%
  group_by(Fund, `X2014.Expenditure.Allowance`) %>%
  summarize(count = n())
## `summarise()` has grouped output by 'Fund'. You can override using the
## `.groups` argument.

Create a heatmap

ggplot(dept_summary, aes(x = `X2014.Expenditure.Allowance`, y = Fund, fill = count)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "blue") +
  labs(x = "Expenditure Allowance", y = "Fund", title = "Department Distribution by Fund and Expenditure Allowance")

Define the expenditure allowance ranges

budget_data$ranges <- cut(budget_data$X2014.Expenditure.Allowance, breaks = seq(0, 8000000, by = 1000000))

Create a summary table with counts of departments by fund and expenditure allowance range

dept_summary <- budget_data %>%
  group_by(Fund, ranges) %>%
  summarize(count = n())
## `summarise()` has grouped output by 'Fund'. You can override using the
## `.groups` argument.

Create a stacked bar chart

ggplot(dept_summary, aes(x = Fund, y = count, fill = ranges)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("#FF9999", "#FF6666", "#FF3333", "#FF0000", "#CC0000", "#990000", "#660000", "#330000")) +
  labs(x = "Fund", y = "Number of Departments", title = "Number of Departments by Fund and Expenditure Allowance Ranges") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

highest expenditure allowance in department

expenditure_by_dept <- budget_data %>%
  group_by(Department) %>%
  summarize(total_expenditure = sum(`X2014.Expenditure.Allowance`)) %>%
  arrange(desc(total_expenditure))

Create the bar chart

ggplot(expenditure_by_dept, aes(x = Department, y = total_expenditure, fill = total_expenditure)) +
  geom_bar(stat = "identity") +
  scale_fill_gradientn(colors = colors, guide = FALSE) +
  xlab("Department") +
  ylab("Total Expenditure Allowance") +
  ggtitle("Total Expenditure Allowance by Department in 2014") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
## Warning: The `guide` argument in `scale_*()` cannot be `FALSE`. This was deprecated in
## ggplot2 3.3.4.
## ℹ Please use "none" instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Create bubble chart

ggplot(budget_data, aes(x = `X2014.Expenditure.Allowance`, y = Department, size = `X2014.Expenditure.Allowance`, color = Department)) +
  geom_point(alpha = 0.7) +
  scale_color_hue(l = 40, c = 80) + # adds a color scale
  xlab("Expenditure Allowance (in R)") +
  ylab("Department") +
  ggtitle("Expenditure Allowance by Department") +
  theme_classic()

Create bubble chart

ggplot(department_summary, aes(x = total_expenditure, y = Department, size = total_expenditure, color = Department)) +
  geom_point(alpha = 0.7) +
  scale_color_hue(l = 40, c = 80) + # adds a color scale
  xlab("Total Expenditure (in R)") +
  ylab("Department") +
  ggtitle("Total Expenditure by Department") +
  theme_classic()

Aggregate the data by BCL code and department

agg_data <- aggregate(X2014.Expenditure.Allowance ~ BCL.Code + Department, data = budget_data, sum)

Create the treemap

treemap(agg_data, 
        index = c("Department", "BCL.Code"), 
        vSize = "X2014.Expenditure.Allowance", 
        type = "index",
        palette=brewer.pal(9, "Set1"),
        title = "Seattle 2014 Endorsed Budget by BCL Code and Department")

budget_data$Fund <- as.numeric(budget_data$Fund)
budget_data$Department <- as.numeric(budget_data$Department)

plot_ly(budget_data, x = ~Fund, y = ~Department, z = ~`X2014.Expenditure.Allowance`, color = ~`X2014.Expenditure.Allowance`,
        marker = list(size = 7, opacity = 0.8, colorscale = 'Viridis')) %>%
  add_markers() %>%
  layout(scene = list(xaxis = list(title = "Fund"),
                      yaxis = list(title = "Department"),
                      zaxis = list(title = "2014 Expenditure Allowance")))